library(tidyverse)
library(psych)
library(readxl)
library(lme4)
source("./scripts/max_factors_efa.R")
source("./scripts/reten_fun.R")
source("./scripts/plot_fun.R")
d <- read.csv("/Users/kweisman/Documents/Research (Stanford)/Projects/Dimkid/dimkid/data/children/run-05_2018-08-07_anonymized.csv")[-1]
question_key <- read_excel("/Users/kweisman/Documents/Research (Stanford)/Projects/Dimkid/design/dimkid4yo (spring 2018)/dimkid4yo versions SAVE (4yo version spring 2018).xlsx") %>%
  select(`Question:`, `Clarification (opt.):`, starts_with("v")) %>%
  rename(question_text = `Question:`,
         question_clar = `Clarification (opt.):`) %>%
  gather(version, question, starts_with("v")) %>%
  mutate(version = as.numeric(gsub("v", "", version)),
         capacity = gsub("Can ___s ", "", question_text),
         capacity = gsub("\\?", "", capacity))
d1 <- d %>%
  left_join(question_key) %>%
  select(run:question, question_text:capacity, response:general_comments) %>%
  filter(!is.na(character), !is.na(capacity))
Joining, by = c("version", "question")
d_efa <- d1 %>%
  select(subid_char, capacity, response_num) %>%
  spread(capacity, response_num) %>%
  column_to_rownames("subid_char")

EFA: All

How many factors to retain?

fa.parallel(d_efa)
 A loading greater than abs(1) was detected.  Examine the loadings carefully.The estimated weights for the factor scores are probably incorrect.  Try a different factor extraction method.
An ultra-Heywood case was detected.  Examine the results carefully
Parallel analysis suggests that the number of factors =  4  and the number of components =  3 

VSS(d_efa, rotate = "oblimin")
 A loading greater than abs(1) was detected.  Examine the loadings carefully.The estimated weights for the factor scores are probably incorrect.  Try a different factor extraction method.
 A loading greater than abs(1) was detected.  Examine the loadings carefully.The estimated weights for the factor scores are probably incorrect.  Try a different factor extraction method.
An ultra-Heywood case was detected.  Examine the results carefully

Very Simple Structure
Call: vss(x = x, n = n, rotate = rotate, diagonal = diagonal, fm = fm, 
    n.obs = n.obs, plot = plot, title = title, use = use, cor = cor)
VSS complexity 1 achieves a maximimum of 0.57  with  1  factors
VSS complexity 2 achieves a maximimum of 0.62  with  3  factors

The Velicer MAP achieves a minimum of 0.02  with  1  factors 
BIC achieves a minimum of  -411.19  with  1  factors
Sample Size adjusted BIC achieves a minimum of  -33.88  with  5  factors

Statistics by number of factors 
  vss1 vss2   map dof chisq    prob sqresid  fit RMSEA  BIC SABIC
1 0.57 0.00 0.021 135   208 5.8e-05      14 0.57 0.083 -411    15
2 0.51 0.59 0.022 118   155 1.3e-02      14 0.59 0.067 -386   -14
3 0.48 0.62 0.023 102   115 1.7e-01      12 0.66 0.050 -352   -30
4 0.40 0.61 0.028  87    93 3.1e-01      12 0.66 0.044 -306   -31
5 0.40 0.53 0.033  73    70 5.7e-01      11 0.67 0.028 -264   -34
6 0.38 0.53 0.039  60    53 7.2e-01      11 0.68 0.000 -222   -32
7 0.36 0.46 0.047  48    36 8.9e-01      11 0.67 0.000 -184   -32
8 0.38 0.48 0.055  37    24 9.6e-01      11 0.68 0.000 -146   -29
  complex eChisq  SRMR eCRMS eBIC
1     1.0    301 0.100 0.107 -318
2     1.2    186 0.079 0.090 -355
3     1.5    114 0.062 0.076 -353
4     1.9     81 0.052 0.069 -318
5     2.0     54 0.042 0.061 -281
6     2.1     36 0.035 0.055 -239
7     2.3     23 0.028 0.050 -197
8     2.4     13 0.021 0.042 -157

reten_fun(d_efa, "oblimin")
[1] 5

5 factors

efa5 <- fa(d_efa, nfactors = 5, rotate = "oblimin") %>% fa.sort()
heatmap_fun(efa5)
Joining, by = "capacity"
Joining, by = "factor"

scoresplot_fun(efa5, target = "all")
Ignoring unknown aesthetics: y

itemsplot_fun(efa5, target = "all")
Joining, by = "capacity"
Joining, by = c("capacity", "factor", "order")

4 factors

efa4 <- fa(d_efa, nfactors = 4, rotate = "oblimin") %>% fa.sort()
heatmap_fun(efa4)
Joining, by = "capacity"
Joining, by = "factor"

scoresplot_fun(efa4, target = "all")
Ignoring unknown aesthetics: y

itemsplot_fun(efa4, target = "all")
Joining, by = "capacity"
Joining, by = c("capacity", "factor", "order")

3 factors

efa3 <- fa(d_efa, nfactors = 3, rotate = "oblimin") %>% fa.sort()
heatmap_fun(efa3)
Joining, by = "capacity"
Joining, by = "factor"

scoresplot_fun(efa3, target = "all")
Ignoring unknown aesthetics: y

itemsplot_fun(efa3, target = "all")
Joining, by = "capacity"
Joining, by = c("capacity", "factor", "order")

2 factors

efa2 <- fa(d_efa, nfactors = 2, rotate = "oblimin") %>% fa.sort()
heatmap_fun(efa2)
Joining, by = "capacity"
Joining, by = "factor"

scoresplot_fun(efa2, target = "all")
Ignoring unknown aesthetics: y

itemsplot_fun(efa2, target = "all")
Joining, by = "capacity"
Joining, by = c("capacity", "factor", "order")

EFA: By game

d_efa %>%
  rownames_to_column("subid_char") %>%
  mutate(character = gsub("^.*_", "", subid_char),
         subid = gsub("_.*$", "", subid_char)) %>%
  left_join(d1 %>% distinct(subid, character, game)) %>%
  count(game, character) %>%
  group_by(game) %>%
  mutate(prop = n/sum(n))
Joining, by = c("character", "subid")
Column `character` joining character vector and factor, coercing into character vectorColumn `subid` joining character vector and factor, coercing into character vector
game1 <- d_efa %>%
  rownames_to_column("subid_char") %>%
  mutate(character = gsub("^.*_", "", subid_char),
         subid = gsub("_.*$", "", subid_char)) %>%
  left_join(d1 %>% distinct(subid, character, game)) %>%
  filter(game == 1) %>%
  select(-c(character, subid, game)) %>%
  column_to_rownames("subid_char")
Joining, by = c("character", "subid")
Column `character` joining character vector and factor, coercing into character vectorColumn `subid` joining character vector and factor, coercing into character vector
fa.parallel(game1)
Parallel analysis suggests that the number of factors =  3  and the number of components =  2 

efa_game1 <- fa(game1, nfactors = 3, rotate = "oblimin") %>% fa.sort()
convergence not obtained in GPFoblq. 1000 iterations used.
heatmap_fun(efa_game1)
Joining, by = "capacity"
Joining, by = "factor"

game2 <- d_efa %>%
  rownames_to_column("subid_char") %>%
  mutate(character = gsub("^.*_", "", subid_char),
         subid = gsub("_.*$", "", subid_char)) %>%
  left_join(d1 %>% distinct(subid, character, game)) %>%
  filter(game == 2) %>%
  select(-c(character, subid, game)) %>%
  column_to_rownames("subid_char")
Joining, by = c("character", "subid")
Column `character` joining character vector and factor, coercing into character vectorColumn `subid` joining character vector and factor, coercing into character vector
fa.parallel(game2)
 A loading greater than abs(1) was detected.  Examine the loadings carefully.The estimated weights for the factor scores are probably incorrect.  Try a different factor extraction method.
An ultra-Heywood case was detected.  Examine the results carefully A loading greater than abs(1) was detected.  Examine the loadings carefully.The estimated weights for the factor scores are probably incorrect.  Try a different factor extraction method.
An ultra-Heywood case was detected.  Examine the results carefully
Parallel analysis suggests that the number of factors =  4  and the number of components =  2 

efa_game2 <- fa(game2, nfactors = 4, rotate = "oblimin") %>% fa.sort()
heatmap_fun(efa_game2)
Joining, by = "capacity"
Joining, by = "factor"

Regressions

d_diff <- d1 %>%
  mutate(factor = case_when(
    capacity %in% c("feel happy", "feel sorry", "get lonely",
                    "get sad", "hate someone", "love someone") ~ "HEART",
    capacity %in% c("feel hungry", "feel sick", "feel tired",
                    "get scared", "get thirsty", "smell things") ~ "BODY",
    capacity %in% c("figure things out", "hear", "know stuff",
                    "remember things", "see", "think") ~ "MIND",
    TRUE ~ "NA")) %>%
  group_by(subid, age_years, character, factor) %>%
  summarise(total = sum(response_num)) %>%
  ungroup() %>%
  spread(factor, total) %>%
  mutate(BminH = BODY - HEART,
         BminM = BODY - MIND,
         MminH = MIND - HEART) %>%
  select(subid, age_years, character, BminH, BminM, MminH) %>%
  gather(comparison, diff, c(BminH, BminM, MminH)) %>%
  mutate(comparison = factor(comparison),
         diff2 = diff * 2,
         age_years_cent = scale(age_years, scale = T),
         character = factor(character, levels = c("beetle", "robot")))
contrasts(d_diff$character) <- cbind(robot_GM = c(-1, 1))
ggplot(d_diff, aes(x = diff, fill = character)) +
  facet_grid(character ~ comparison) +
  geom_histogram(binwidth = 1) +
  geom_vline(xintercept = 0, lty = 2, color = "black") +
  theme_bw()

ggplot(d_diff, aes(x = age_years, y = diff, color = character)) +
                 # color = character, fill = character)) +
                 # color = comparison, fill = comparison)) +
  facet_grid(~ comparison) +
  geom_hline(yintercept = 0, lty = 2, color = "darkgray") +
  geom_point(alpha = 0.5) +
  geom_smooth(aes(group = comparison), color = "black",
              method = "lm", alpha = 0.25) +
  theme_bw()

ggplot(d_diff, aes(x = age_years, y = abs(diff), color = character)) +
  # color = character, fill = character)) +
  # color = comparison, fill = comparison)) +
  facet_grid(~ comparison) +
  geom_hline(yintercept = 0, lty = 2, color = "darkgray") +
  geom_point(alpha = 0.5) +
  geom_smooth(aes(group = comparison), color = "black",
              method = "lm", alpha = 0.25) +
  theme_bw()

r_d_diffBminH <- lme4::glmer(abs(diff2) ~ age_years_cent +
                             (1 | subid) + (age_years_cent | character),
                           d_diff %>% filter(comparison == "BminH"),
                           family = "poisson",
                           control = glmerControl(optimizer = "bobyqa"))
summary(r_d_diffBminH)
Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: poisson  ( log )
Formula: abs(diff2) ~ age_years_cent + (1 | subid) + (age_years_cent |  
    character)
   Data: d_diff %>% filter(comparison == "BminH")
Control: glmerControl(optimizer = "bobyqa")

     AIC      BIC   logLik deviance df.resid 
   408.2    423.5   -198.1    396.2       88 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-1.5898 -0.7534 -0.1284  0.5414  2.5282 

Random effects:
 Groups    Name           Variance  Std.Dev. Corr 
 subid     (Intercept)    0.1462600 0.38244       
 character (Intercept)    0.0157481 0.12549       
           age_years_cent 0.0009871 0.03142  -1.00
Number of obs: 94, groups:  subid, 48; character, 2

Fixed effects:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)      0.9706     0.1245   7.795 6.45e-15 ***
age_years_cent   0.1537     0.0872   1.762    0.078 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (Intr)
age_yrs_cnt -0.236
r_d_diffBminM <- lme4::glmer(abs(diff2) ~ age_years_cent +
                             (1 | subid) + (age_years_cent | character),
                           d_diff %>% filter(comparison == "BminM"),
                           family = "poisson",
                           control = glmerControl(optimizer = "bobyqa"))
summary(r_d_diffBminM)
Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: poisson  ( log )
Formula: abs(diff2) ~ age_years_cent + (1 | subid) + (age_years_cent |  
    character)
   Data: d_diff %>% filter(comparison == "BminM")
Control: glmerControl(optimizer = "bobyqa")

     AIC      BIC   logLik deviance df.resid 
   428.5    443.7   -208.2    416.5       88 

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-1.95124 -0.97732 -0.00804  0.55469  2.77991 

Random effects:
 Groups    Name           Variance Std.Dev. Corr
 subid     (Intercept)    0.270989 0.52057      
 character (Intercept)    0.006126 0.07827      
           age_years_cent 0.018322 0.13536  1.00
Number of obs: 94, groups:  subid, 48; character, 2

Fixed effects:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.91652    0.11740   7.807 5.87e-15 ***
age_years_cent  0.02916    0.13765   0.212    0.832    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (Intr)
age_yrs_cnt 0.307 
r_d_diffMminH <- lme4::glmer(abs(diff2) ~ age_years_cent +
                             (1 | subid) + (age_years_cent | character),
                           d_diff %>% filter(comparison == "MminH"),
                           family = "poisson",
                           control = glmerControl(optimizer = "bobyqa"))
summary(r_d_diffMminH)
Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: poisson  ( log )
Formula: abs(diff2) ~ age_years_cent + (1 | subid) + (age_years_cent |  
    character)
   Data: d_diff %>% filter(comparison == "MminH")
Control: glmerControl(optimizer = "bobyqa")

     AIC      BIC   logLik deviance df.resid 
   407.9    423.2   -197.9    395.9       88 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-1.5706 -0.9441 -0.1916  0.7781  2.2496 

Random effects:
 Groups    Name           Variance  Std.Dev.  Corr
 subid     (Intercept)    2.844e-01 5.333e-01     
 character (Intercept)    0.000e+00 0.000e+00     
           age_years_cent 1.060e-17 3.256e-09  NaN
Number of obs: 94, groups:  subid, 48; character, 2

Fixed effects:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)     0.77742    0.10986   7.076 1.48e-12 ***
age_years_cent -0.08904    0.10278  -0.866    0.386    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Correlation of Fixed Effects:
            (Intr)
age_yrs_cnt 0.033 

Demographics

d %>% distinct(subid) %>% count() %>% data.frame()
   n
1 50
d %>% distinct(subid, gender) %>% count(gender) %>% data.frame()
  gender  n
1      f 17
2      m 11
3   <NA> 22
d %>% distinct(subid, age_years) %>% 
  summarise(mean = mean(age_years, na.rm = T),
            sd = sd(age_years, na.rm = T),
            median = median(age_years, na.rm = T)) %>% data.frame()
      mean        sd   median
1 4.627507 0.4905214 4.665753
d %>% distinct(subid, age_years) %>% 
  ggplot(aes(x = age_years)) +
  geom_histogram(binwidth = 2/12) +
  geom_vline(xintercept = median(d$age_years), lty = 2, color = "blue") +
  theme_bw()

d %>% distinct(subid, ethnicity_collapse) %>% count(ethnicity_collapse) %>% data.frame()
  ethnicity_collapse  n
1                  A  2
2                  C 13
3                 Cj  1
4                  H  1
5                  I  1
6           multiple 10
7               <NA> 22
d %>% distinct(subid, experimenter) %>% count(experimenter) %>% data.frame()
  experimenter  n
1           cx 13
2           kz 17
3           na 20
---
title: "Dimkid4yo"
output: 
  html_notebook:
    toc: true
    toc_float: true
---

```{r}
library(tidyverse)
library(psych)
library(readxl)
library(lme4)
```

```{r}
source("./scripts/max_factors_efa.R")
source("./scripts/reten_fun.R")
source("./scripts/plot_fun.R")
```

```{r}
d <- read.csv("/Users/kweisman/Documents/Research (Stanford)/Projects/Dimkid/dimkid/data/children/run-05_2018-08-07_anonymized.csv")[-1]
```

```{r}
question_key <- read_excel("/Users/kweisman/Documents/Research (Stanford)/Projects/Dimkid/design/dimkid4yo (spring 2018)/dimkid4yo versions SAVE (4yo version spring 2018).xlsx") %>%
  select(`Question:`, `Clarification (opt.):`, starts_with("v")) %>%
  rename(question_text = `Question:`,
         question_clar = `Clarification (opt.):`) %>%
  gather(version, question, starts_with("v")) %>%
  mutate(version = as.numeric(gsub("v", "", version)),
         capacity = gsub("Can ___s ", "", question_text),
         capacity = gsub("\\?", "", capacity))
```

```{r}
d1 <- d %>%
  left_join(question_key) %>%
  select(run:question, question_text:capacity, response:general_comments) %>%
  filter(!is.na(character), !is.na(capacity))
```

```{r}
d_efa <- d1 %>%
  select(subid_char, capacity, response_num) %>%
  spread(capacity, response_num) %>%
  column_to_rownames("subid_char")
```


# EFA: All

## How many factors to retain?

```{r}
fa.parallel(d_efa)
```

```{r}
VSS(d_efa, rotate = "oblimin")
```

```{r}
reten_fun(d_efa, "oblimin")
```

## 5 factors

```{r}
efa5 <- fa(d_efa, nfactors = 5, rotate = "oblimin") %>% fa.sort()
```

```{r}
heatmap_fun(efa5)
```

```{r}
scoresplot_fun(efa5, target = "all")
```

```{r, fig.width = 3, fig.asp = 1}
itemsplot_fun(efa5, target = "all")
```


## 4 factors

```{r}
efa4 <- fa(d_efa, nfactors = 4, rotate = "oblimin") %>% fa.sort()
```

```{r}
heatmap_fun(efa4)
```

```{r}
scoresplot_fun(efa4, target = "all")
```

```{r, fig.width = 3, fig.asp = 1}
itemsplot_fun(efa4, target = "all")
```


## 3 factors

```{r}
efa3 <- fa(d_efa, nfactors = 3, rotate = "oblimin") %>% fa.sort()
```

```{r}
heatmap_fun(efa3)
```

```{r}
scoresplot_fun(efa3, target = "all")
```

```{r, fig.width = 3, fig.asp = 1}
itemsplot_fun(efa3, target = "all")
```


## 2 factors

```{r}
efa2 <- fa(d_efa, nfactors = 2, rotate = "oblimin") %>% fa.sort()
```

```{r}
heatmap_fun(efa2)
```

```{r}
scoresplot_fun(efa2, target = "all")
```

```{r, fig.width = 3, fig.asp = 1}
itemsplot_fun(efa2, target = "all")
```



# EFA: By game #

```{r}
d_efa %>%
  rownames_to_column("subid_char") %>%
  mutate(character = gsub("^.*_", "", subid_char),
         subid = gsub("_.*$", "", subid_char)) %>%
  left_join(d1 %>% distinct(subid, character, game)) %>%
  count(game, character) %>%
  group_by(game) %>%
  mutate(prop = n/sum(n))
```


```{r}
game1 <- d_efa %>%
  rownames_to_column("subid_char") %>%
  mutate(character = gsub("^.*_", "", subid_char),
         subid = gsub("_.*$", "", subid_char)) %>%
  left_join(d1 %>% distinct(subid, character, game)) %>%
  filter(game == 1) %>%
  select(-c(character, subid, game)) %>%
  column_to_rownames("subid_char")

fa.parallel(game1)
efa_game1 <- fa(game1, nfactors = 3, rotate = "oblimin") %>% fa.sort()
heatmap_fun(efa_game1)
```

```{r}
game2 <- d_efa %>%
  rownames_to_column("subid_char") %>%
  mutate(character = gsub("^.*_", "", subid_char),
         subid = gsub("_.*$", "", subid_char)) %>%
  left_join(d1 %>% distinct(subid, character, game)) %>%
  filter(game == 2) %>%
  select(-c(character, subid, game)) %>%
  column_to_rownames("subid_char")

fa.parallel(game2)
efa_game2 <- fa(game2, nfactors = 4, rotate = "oblimin") %>% fa.sort()
heatmap_fun(efa_game2)
```


# Regressions

```{r}
d_diff <- d1 %>%
  mutate(factor = case_when(
    capacity %in% c("feel happy", "feel sorry", "get lonely",
                    "get sad", "hate someone", "love someone") ~ "HEART",
    capacity %in% c("feel hungry", "feel sick", "feel tired",
                    "get scared", "get thirsty", "smell things") ~ "BODY",
    capacity %in% c("figure things out", "hear", "know stuff",
                    "remember things", "see", "think") ~ "MIND",
    TRUE ~ "NA")) %>%
  group_by(subid, age_years, character, factor) %>%
  summarise(total = sum(response_num)) %>%
  ungroup() %>%
  spread(factor, total) %>%
  mutate(BminH = BODY - HEART,
         BminM = BODY - MIND,
         MminH = MIND - HEART) %>%
  select(subid, age_years, character, BminH, BminM, MminH) %>%
  gather(comparison, diff, c(BminH, BminM, MminH)) %>%
  mutate(comparison = factor(comparison),
         diff2 = diff * 2,
         age_years_cent = scale(age_years, scale = T),
         character = factor(character, levels = c("beetle", "robot")))

contrasts(d_diff$character) <- cbind(robot_GM = c(-1, 1))
```

```{r}
ggplot(d_diff, aes(x = diff, fill = character)) +
  facet_grid(character ~ comparison) +
  geom_histogram(binwidth = 1) +
  geom_vline(xintercept = 0, lty = 2, color = "black") +
  theme_bw()
```

```{r, fig.width = 3, fig.asp = 0.5}
ggplot(d_diff, aes(x = age_years, y = diff, color = character)) +
                 # color = character, fill = character)) +
                 # color = comparison, fill = comparison)) +
  facet_grid(~ comparison) +
  geom_hline(yintercept = 0, lty = 2, color = "darkgray") +
  geom_point(alpha = 0.5) +
  geom_smooth(aes(group = comparison), color = "black",
              method = "lm", alpha = 0.25) +
  theme_bw()
```

```{r, fig.width = 3, fig.asp = 0.5}
ggplot(d_diff, aes(x = age_years, y = abs(diff), color = character)) +
  # color = character, fill = character)) +
  # color = comparison, fill = comparison)) +
  facet_grid(~ comparison) +
  geom_hline(yintercept = 0, lty = 2, color = "darkgray") +
  geom_point(alpha = 0.5) +
  geom_smooth(aes(group = comparison), color = "black",
              method = "lm", alpha = 0.25) +
  theme_bw()
```

```{r}
r_d_diffBminH <- lme4::glmer(abs(diff2) ~ age_years_cent +
                             (1 | subid) + (age_years_cent | character),
                           d_diff %>% filter(comparison == "BminH"),
                           family = "poisson",
                           control = glmerControl(optimizer = "bobyqa"))
summary(r_d_diffBminH)
```

```{r}
r_d_diffBminM <- lme4::glmer(abs(diff2) ~ age_years_cent +
                             (1 | subid) + (age_years_cent | character),
                           d_diff %>% filter(comparison == "BminM"),
                           family = "poisson",
                           control = glmerControl(optimizer = "bobyqa"))
summary(r_d_diffBminM)
```

```{r}
r_d_diffMminH <- lme4::glmer(abs(diff2) ~ age_years_cent +
                             (1 | subid) + (age_years_cent | character),
                           d_diff %>% filter(comparison == "MminH"),
                           family = "poisson",
                           control = glmerControl(optimizer = "bobyqa"))
summary(r_d_diffMminH)
```



# Demographics

```{r}
d %>% distinct(subid) %>% count() %>% data.frame()
```

```{r}
d %>% distinct(subid, gender) %>% count(gender) %>% data.frame()
```

```{r}
d %>% distinct(subid, age_years) %>% 
  summarise(mean = mean(age_years, na.rm = T),
            sd = sd(age_years, na.rm = T),
            median = median(age_years, na.rm = T)) %>% data.frame()
```

```{r}
d %>% distinct(subid, age_years) %>% 
  ggplot(aes(x = age_years)) +
  geom_histogram(binwidth = 2/12) +
  geom_vline(xintercept = median(d$age_years), lty = 2, color = "blue") +
  theme_bw()
```

```{r}
d %>% distinct(subid, ethnicity_collapse) %>% count(ethnicity_collapse) %>% data.frame()
```

```{r}
d %>% distinct(subid, experimenter) %>% count(experimenter) %>% data.frame()
```

